I use lyrics_processed as the data for my analysis. “lyrics_processed” is a processed corpus of 380,000+ song lyrics.

Here, we explore these data sets and try to find interesting patterns.

Load all the required libraries.

library("tidyverse")
library("tidytext")
library("plotly")
library("DT")
library("tm")
library("data.table")
library("scales")
library("ngram")
library("shiny")
library("qdap")
library("sentimentr")
library("gplots")
library("dplyr")
library("tm")
library("syuzhet")
library("factoextra")
library("beeswarm")
library("scales")
library("RColorBrewer")
library("RANN")
library("tm")
library("topicmodels")

This notebook was prepared with the following environmental settings.

# Record the R build this notebook was rendered with (`version` is the same
# object as `R.version`).
print(version)
##                _                           
## platform       x86_64-w64-mingw32          
## arch           x86_64                      
## os             mingw32                     
## system         x86_64, mingw32             
## status                                     
## major          3                           
## minor          6.1                         
## year           2019                        
## month          07                          
## day            05                          
## svn rev        76782                       
## language       R                           
## version.string R version 3.6.1 (2019-07-05)
## nickname       Action of the Toes

Load the processed lyrics data.

I use the processed lyrics data for analysis.

# load lyrics data
#load('../output/lyrics_processed.RData')
#dt_processed<-dt_processed%>%drop_na()

Data description.

#dim(dt_processed)

The data has 125704 rows and 7 columns

Get the sentiment scores and the word count of each song's lyrics

# The commented-out steps below show how `lyrics.list` was originally built
# from `dt_processed`: NRC sentiment scores and a word count were computed for
# each song's lyrics, column-bound onto the processed data, and saved to disk.
# They are kept for reference only and are not re-run here.
#emotions=get_nrc_sentiment(dt_processed$lyrics)
#word.count=word_count(dt_processed$lyrics)
#lyrics.list=cbind(dt_processed,emotions,word.count)
#save(lyrics.list, file="../output/lyrics_list.RData")

# Load the previously saved result instead of recomputing it; this restores
# the objects stored in the .RData file (`lyrics.list`, per the save() call
# above) into the current environment.
load("../output/lyrics_list.RData")

I first had a look at the trend of each emotion over the years.

# Yearly average of every sentiment score: keep the year plus the ten
# sentiment columns, then take the mean of each column within each year.
lyrics.list.time.emotions<-lyrics.list%>%
  select(year,anger,anticipation,disgust,fear,joy,sadness,surprise,trust,negative,positive)%>%
  group_by(year)%>%
  summarise_all(mean)

# Long format for ggplot: one row per (year, emotion type) pair.
lyrics.list.time.emotions.ggplot<-lyrics.list.time.emotions%>%
  pivot_longer(cols=-year,
               names_to='emotion.type',
               values_to='emotion.count')

# One line per emotion type, restricted to years after 1995.
lyrics.list.time.emotions.ggplot%>%
  filter(year>1995)%>%
  ggplot(aes(x=year,y=emotion.count,color=emotion.type))+
  geom_line()+
  scale_color_discrete("Average of emotions")+
  labs(title='Emotions in lyrics each year',
       x='Year',
       y='Number of emotions')+
  theme_light()+
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5))

According to the plot, we can see that the positive and negative sentiment scores are much higher than all the other emotions.

Then I had a look at the word counts of lyrics for the years 1996-2016.

# Word count per song for songs from 1996 onwards.
# `year.new` is a "year <YYYY>" label; `year.reorder` is the same label as an
# ordered factor whose levels are sorted by the (mean) numeric year, so the
# beeswarm rows appear in chronological order.
lyrics.list.wordcount<-lyrics.list%>%
  select(year,word.count)%>%
  filter(year>=1996)%>%
  mutate(year.new=factor(paste('year',year)),
         # spell out TRUE: `T` is an ordinary variable that can be reassigned
         year.reorder=reorder(year.new,year,mean,order=TRUE))

# Horizontal beeswarm: one row per year, one point per song.
beeswarm(word.count ~ year.reorder,
         data = lyrics.list.wordcount,
         horizontal = TRUE,
         pch = 16, col = alpha(brewer.pal(9, "Set1"), 0.6),
         cex = 0.5, cex.axis = 0.8, cex.lab = 0.8,
         # tighten the point spacing as the number of year rows grows
         spacing = .5/nlevels(lyrics.list.wordcount$year.reorder),
         las = 2, xlab = "Number of words in a song.", ylab = "",
         main = "Songs in year 1996-2016")

We can see that more songs were published in 2006-2007 than in other periods.

Then I had a look at the distribution of sentiments in each genre.

I did a comparison using a Shiny app. For simpler visualization, I first summarized the data.

preparation for visualization

# Mean of each sentiment column (anger through positive) within every genre.
lyrics.genre<-lyrics.list%>%
  group_by(genre)%>%
  summarise_at(vars(anger:positive),mean)

# Add the total of the ten mean scores as `sum` and list the genres from the
# highest total to the lowest.
lyrics.genre.simplified<-arrange(
  mutate(lyrics.genre,
         sum=anger+anticipation+disgust+fear+
           joy+sadness+surprise+trust+negative+positive),
  desc(sum)
)
lyrics.genre.simplified

According to the table, I chose three genres (Hip-Hop, Metal and Folk) to show their sentiment distributions.

# Genres to compare on the radar chart.
genre_list<-c('Hip-Hop','Metal','Folk')

# Keep only those genres and drop the trailing `sum` helper column, leaving
# `genre` plus the ten sentiment means (columns 1 to 11).
lyrics.genre.simplified<-lyrics.genre.simplified%>%
  filter(is.element(genre,genre_list))%>%
  select(-sum)

Draw the radar plot for these three genres.

# Radar (scatterpolar) chart of the mean sentiment scores, one trace per genre
# row of `lyrics.genre.simplified` (column 1 = genre, columns 2-11 = scores).

# Shared radial-axis limits over all plotted values. Deliberately NOT named
# `min` / `max`, so the base functions are not shadowed in the global
# environment.
radial.min<-min(lyrics.genre.simplified[2:11])
radial.max<-max(lyrics.genre.simplified[2:11])
emotion.axes<-names(lyrics.genre.simplified)[2:11]

# Base figure. `mode` is stated explicitly so plotly does not have to guess it
# (and emit a "No scatterpolar mode specified" warning) for every trace;
# 'markers' is the value plotly was falling back to. Traces added below
# inherit type, mode and fill from here.
radar<-plot_ly(
  type = 'scatterpolar',
  mode = 'markers',
  fill = 'toself'
)

# Add one trace per genre instead of hard-coding rows 1-3, so the chart
# follows whatever genres were kept in `lyrics.genre.simplified`.
for (i in seq_len(nrow(lyrics.genre.simplified))) {
  radar<-add_trace(
    radar,
    r = as.numeric(lyrics.genre.simplified[i,2:11]),
    theta = emotion.axes,
    name = as.character(lyrics.genre.simplified$genre[i])
  )
}

radar %>%
  layout(
    polar = list(
      radialaxis = list(
        visible = TRUE,
        range = c(radial.min,radial.max)
      )
    )
  )
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## No scatterpolar mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

Cluster the genres

Based on the sentiment scores of all the genres, I explored whether the genres can be clustered into larger groups.

# Correlation heatmap of the eight emotion scores (anger through trust) across
# Hip-Hop songs. Values are plotted unscaled, without colour key or trace
# lines.
# Fixes: removed a stray empty argument (", ,"), spelled `margins` in full
# instead of relying on partial argument matching, and replaced `F` by FALSE.
heatmap.2(cor(lyrics.list%>%filter(genre=="Hip-Hop")%>%select(anger:trust)),
          scale = "none",
          col = bluered(100), margins = c(4,4), key = FALSE,
          trace = "none", density.info = "none")

# Widen the left margin so the horizontal emotion labels fit.
par(mar=c(4, 6, 2, 1))

# Share of Hip-Hop songs in which each emotion occurs (score above 0.01).
# The plot is titled "Hip Hop", so the data is restricted to that genre here,
# matching the Hip-Hop heatmap; previously the means were taken over songs of
# every genre.
emo.means<-colMeans(
  (lyrics.list%>%filter(genre=="Hip-Hop")%>%select(anger:trust))>0.01
)

# One colour per emotion, in the column order anger ... trust.
col.use<-c("red2", "darkgoldenrod1",
           "chartreuse3", "blueviolet",
           "darkgoldenrod2", "dodgerblue3",
           "darkgoldenrod1", "darkgoldenrod1")

# Bars sorted by share; colours are reordered the same way to stay aligned.
emo.order<-order(emo.means)
barplot(emo.means[emo.order], las=2, col=col.use[emo.order],
        horiz=TRUE, main="Hip Hop")

# Per-genre mean of every numeric column, used as the feature table for
# clustering. `as_tibble()` replaces the deprecated `tbl_df()`.
lyrics.summary<-as_tibble(lyrics.list)%>%
  group_by(genre)%>%
  summarise_if(is.numeric,mean)%>%
  # drops the 2nd and 3rd summary columns by position -- presumably `year` and
  # another non-sentiment numeric column; TODO confirm against lyrics.list
  select(-2,-3)

# kmeans()/fviz_cluster() label points by row name, so move genre there.
lyrics.summary<-as.data.frame(lyrics.summary)
rownames(lyrics.summary)<-as.character(lyrics.summary[,1])

# k-means starts from random centres; fix the seed so the three clusters (and
# the conclusion drawn from them) are reproducible between runs.
set.seed(2020)
km.res<-kmeans(lyrics.summary[,-1],centers=3,iter.max=200)

# Cluster plot on the unstandardised features.
fviz_cluster(km.res,
             stand=FALSE,repel=TRUE,
             data=lyrics.summary[,-1],xlab='',xaxt='n',
             show.clust.cent = FALSE)+theme_light()+
  theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))

Conclusion: the genres are divided into three groups according to their emotions. We can see from the plot that Hip-Hop is a genre of its own. Rock, country, electronic and jazz have similar styles of expressing emotions. Folk and popular music fall into a broad category based on the mood they contain.

reference